library(dplyr)
library(readr)
library(ggplot2)
library(openxlsx)
library(knitr)
library(tibble)
library(stringr)
library(stringi)
library(readxl)
library(lubridate)
library(shiny)
library(plotly)
library(ruODK)
Loading the Data and Removal of Training Data
# Unzip and extract ODK data from ODK zip
# NOTE(review): export_load_from_odk() is defined outside this section; it is
# called with `params$svc` and its result becomes the working data frame `df`
# that all later steps read and extend.
df <- export_load_from_odk(params$svc)
## <ruODK settings>
## Default ODK Central Project ID: 2
## Default ODK Central Form ID: 02-TIMCI-SPA-CGEI
## Default ODK Central URL: https://timicodktest.smartforest.de
## Default ODK Central Username: lucas.silbernagel@swisstph.ch
## Default ODK Central Password: run ruODK::get_default_pw() to show
## Default ODK Central Passphrase: run ruODK::get_default_pp() to show
## Default Time Zone: Europe/Berlin
## Default ODK Central Version: 1.1
## Default HTTP GET retries: 3
## Verbose messages: TRUE
## Test ODK Central Project ID:
## Test ODK Central Form ID:
## Test ODK Central Form ID (ZIP tests):
## Test ODK Central Form ID (Attachment tests):
## Test ODK Central Form ID (Parsing tests):
## Test ODK Central Form ID (WKT tests):
## Test ODK Central URL:
## Test ODK Central Username:
## Test ODK Central Password: run ruODK::get_test_pw() to show
## Test ODK Central Passphrase: run ruODK::get_test_pp() to show
## Test ODK Central Version: 1.1
# Convert both timestamp columns from integers (in ms) to time stamps,
# applying the same helper to each column in turn.
timestamp_cols <- c("start", "end")
df[timestamp_cols] <- lapply(df[timestamp_cols], format_date_ms)

# Preview the first rows of the loaded data.
head(df)
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
form start |
NA |
2021-02-02 15:18:42 |
NA |
NA |
NA |
NA |
NA |
NA |
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
group questions |
/data/front_page |
2021-02-02 15:18:42 |
2021-02-02 15:18:43 |
NA |
NA |
NA |
NA |
NA |
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
form resume |
NA |
2021-02-09 09:09:37 |
NA |
NA |
NA |
NA |
NA |
NA |
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
jump |
NA |
2021-02-09 09:09:37 |
2021-02-09 09:09:51 |
NA |
NA |
NA |
NA |
NA |
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
group questions |
/data/g5 |
2021-02-09 09:09:48 |
2021-02-09 09:09:51 |
NA |
NA |
NA |
NA |
NA |
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
group questions |
/data/a1 |
2021-02-09 09:09:51 |
2021-02-09 09:09:57 |
NA |
NA |
NA |
NA |
NA |
# filtering for events that occurred after 18 July 2021 (currently disabled, so no rows are removed)
#df <- subset(df, as.Date(start) > as.Date("18.07.2021", "%d.%m.%Y"))
Deriving New Features
Time Spent per Event
# Time spent per event in whole seconds (end minus start).
# The unit is fixed explicitly: subtracting two date-times lets R choose the
# unit (secs/mins/hours/days) from the data, whereas the histogram further
# down divides time_spent by 60 to obtain minutes, i.e. expects seconds.
df$time_spent <- round(as.numeric(difftime(df$end, df$start, units = "secs")))
Question
# splitting the node strings so that only the question name remains
# (in the previews of this report, node "/data/a1/a1_a_4" yields "a1_a_4")
# NOTE(review): create_question() is defined elsewhere; its return type for
# NA nodes is not visible here, so confirm it before replacing sapply() with
# a type-stable vapply().
df$question = sapply(df$node, create_question)
Question Decoded
# Attach a human-readable label for each question; the data summary of this
# report shows a question_decoded column after this step.
# NOTE(review): decode_question() is defined elsewhere — presumably it looks
# the labels up in the form identified by params$svc; TODO confirm.
df <- decode_question(df, df$question, params$svc)
## <ruODK settings>
## Default ODK Central Project ID: 2
## Default ODK Central Form ID: 02-TIMCI-SPA-CGEI
## Default ODK Central URL: https://timicodktest.smartforest.de
## Default ODK Central Username: lucas.silbernagel@swisstph.ch
## Default ODK Central Password: run ruODK::get_default_pw() to show
## Default ODK Central Passphrase: run ruODK::get_default_pp() to show
## Default Time Zone: Europe/Berlin
## Default ODK Central Version: 1.1
## Default HTTP GET retries: 3
## Verbose messages: TRUE
## Test ODK Central Project ID:
## Test ODK Central Form ID:
## Test ODK Central Form ID (ZIP tests):
## Test ODK Central Form ID (Attachment tests):
## Test ODK Central Form ID (Parsing tests):
## Test ODK Central Form ID (WKT tests):
## Test ODK Central URL:
## Test ODK Central Username:
## Test ODK Central Password: run ruODK::get_test_pw() to show
## Test ODK Central Passphrase: run ruODK::get_test_pp() to show
## Test ODK Central Version: 1.1
Categorical Answers Decoded
# Decode categorical answers; the data summary of this report shows
# new_value_decoded and old_value_decoded columns after this step.
# NOTE(review): decode_categories() is defined elsewhere — presumably it maps
# choice codes to labels for the form identified by params$svc; TODO confirm.
df <- decode_categories(df, params$svc)
## <ruODK settings>
## Default ODK Central Project ID: 2
## Default ODK Central Form ID: 02-TIMCI-SPA-CGEI
## Default ODK Central URL: https://timicodktest.smartforest.de
## Default ODK Central Username: lucas.silbernagel@swisstph.ch
## Default ODK Central Password: run ruODK::get_default_pw() to show
## Default ODK Central Passphrase: run ruODK::get_default_pp() to show
## Default Time Zone: Europe/Berlin
## Default ODK Central Version: 1.1
## Default HTTP GET retries: 3
## Verbose messages: TRUE
## Test ODK Central Project ID:
## Test ODK Central Form ID:
## Test ODK Central Form ID (ZIP tests):
## Test ODK Central Form ID (Attachment tests):
## Test ODK Central Form ID (Parsing tests):
## Test ODK Central Form ID (WKT tests):
## Test ODK Central URL:
## Test ODK Central Username:
## Test ODK Central Password: run ruODK::get_test_pw() to show
## Test ODK Central Passphrase: run ruODK::get_test_pp() to show
## Test ODK Central Version: 1.1
Time until a Response was Changed + Stream of Answer Changes
# Time (in whole seconds) until an answer was changed.
#
# Rows are ordered by instance, question node and start time, so consecutive
# rows of the same question are successive edits. A row counts as a change
# when its old value equals the new value of the directly preceding row of
# the SAME instance and node; time_till_change is then the gap between the
# end of the previous event and the start of this one.
#
# Compared with a row-by-row loop this
#   * never looks at "row 0" and also works on an empty data frame,
#   * never pairs rows belonging to different instances or questions,
#   * treats missing values as "no change" instead of comparing them,
#   * always measures the gap in seconds (plain subtraction of date-times
#     picks secs/mins/hours/days per value).
df <- df %>%
  # bringing the data in the right order
  arrange(instance.ID, node, start) %>%
  mutate(
    # values of the directly preceding row (NA for the first row)
    .prev_instance = dplyr::lag(instance.ID),
    .prev_node = dplyr::lag(node),
    .prev_new_value = dplyr::lag(new.value),
    .prev_end = dplyr::lag(end),
    # any NA in the comparison means "no detectable change"
    .answer_changed = coalesce(
      instance.ID == .prev_instance &
        node == .prev_node &
        old.value == .prev_new_value,
      FALSE
    ),
    time_till_change = if_else(
      .answer_changed,
      round(as.numeric(difftime(start, .prev_end, units = "secs"))),
      NA_real_
    ),
    # NOTE(review): placeholder column; it was never filled in. Presumably it
    # should hold the answer before the change — confirm and populate.
    changed_from = NA
  ) %>%
  select(
    -.prev_instance, -.prev_node, -.prev_new_value, -.prev_end,
    -.answer_changed
  )
Preview and Summary of the Final Data
# Preview the first rows of the data including the derived columns.
head(df)
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
group questions |
/data/a1 |
2021-02-09 09:09:51 |
2021-02-09 09:09:57 |
NA |
NA |
NA |
NA |
NA |
6 |
a1 |
Participant identification |
NA |
NA |
NA |
NA |
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
group questions |
/data/a1 |
2021-02-09 09:10:16 |
2021-02-09 09:10:52 |
NA |
NA |
NA |
NA |
NA |
36 |
a1 |
Participant identification |
NA |
NA |
NA |
NA |
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
question |
/data/a1/a1_a_4 |
2021-02-09 09:10:16 |
2021-02-09 09:10:52 |
NA |
NA |
NA |
NA |
K-F019-P0106 |
36 |
a1_a_4 |
Please scan the participant’s QR code |
K-F019-P0106 |
NA |
NA |
NA |
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
group questions |
/data/b1 |
2021-02-09 09:09:57 |
2021-02-09 09:10:03 |
NA |
NA |
NA |
NA |
NA |
6 |
b1 |
Facility identification |
NA |
NA |
NA |
NA |
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
group questions |
/data/b1 |
2021-02-09 09:10:08 |
2021-02-09 09:10:16 |
NA |
NA |
NA |
NA |
NA |
8 |
b1 |
Facility identification |
NA |
NA |
NA |
NA |
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
question |
/data/b1/b1_4 |
2021-02-09 09:10:08 |
2021-02-09 09:10:16 |
NA |
NA |
NA |
NA |
Mbour |
8 |
b1_4 |
Please select the current district |
Mbour |
NA |
NA |
NA |
# Per-column summary (type, range, number of NAs) of the final data.
summary(df)
## instance.ID event node start
## Length:77 Length:77 Length:77 Min. :2021-02-02 15:18:42
## Class :character Class :character Class :character 1st Qu.:2021-02-09 09:11:02
## Mode :character Mode :character Mode :character Median :2021-02-09 09:45:45
## Mean :2021-04-10 07:29:35
## 3rd Qu.:2021-06-21 13:26:01
## Max. :2021-06-21 13:26:23
##
## end latitude longitude accuracy old.value
## Min. :2021-02-02 15:18:43 Mode:logical Mode:logical Mode:logical Mode:logical
## 1st Qu.:2021-02-09 09:11:52 NA's:77 NA's:77 NA's:77 NA's:77
## Median :2021-02-09 09:17:39
## Mean :2021-04-11 11:04:47
## 3rd Qu.:2021-06-21 13:26:06
## Max. :2021-06-21 13:26:23
## NA's :10
## new.value time_spent question question_decoded new_value_decoded
## Length:77 Min. : 2.00 Length:77 Length:77 Length:77
## Class :character 1st Qu.: 5.00 Class :character Class :character Class :character
## Mode :character Median : 8.00 Mode :character Mode :character Mode :character
## Mean : 33.75
## 3rd Qu.: 20.00
## Max. :236.00
## NA's :10
## old_value_decoded time_till_change changed_from
## Mode:logical Mode:logical Mode:logical
## NA's:77 NA's:77 NA's:77
##
##
##
##
##
Grouped by Time
Events/Questions Started by Day
# Number of events/questions started on each calendar day.
# NOTE(review): as.Date() on a date-time uses UTC unless told otherwise, while
# the weekday/hour plot uses the time zone stored in `start` — confirm that
# days are meant to be cut at UTC midnight.
df_by_day <- df %>%
  mutate(start_date = as.Date(start)) %>%
  count(start_date, name = "count")

# Daily counts as a line with a loess trend on top (no confidence band).
gg1 <- ggplot(df_by_day, aes(x = start_date, y = count)) +
  geom_line() +
  geom_smooth(alpha = 0.5, colour = "red", method = "loess", se = FALSE) +
  labs(
    title = "Number of Events/Questions Started by Day with Smoothed Regression Line",
    y = "Number of Questions/Events Started",
    x = "Start Date"
  ) +
  theme_light()
gg1

Questions/Events started by Weekday and Hour of the Day
# Minimal heatmap theme: no grid, border, ticks or legend; centred bold title.
theme_heatmap <- theme_light() +
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    plot.title = element_text(face = "bold", size = 11, hjust = 0.5),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 10),
    legend.position = "none"
  )

# Count started events/questions per weekday (weeks start on Monday) and hour.
df_wday_hour <- df %>%
  mutate(
    wday = wday(start, label = TRUE, week_start = 1),
    hour = hour(start)
  ) %>%
  count(wday, hour, name = "count_wday_hour") %>%
  arrange(desc(wday))

# Heatmap: weekdays across the top, hours running downwards, and the count
# printed inside every tile.
gg2 <- ggplot(
  df_wday_hour,
  aes(x = wday, y = hour, fill = count_wday_hour)
) +
  geom_tile(colour = "white") +
  scale_fill_gradient(low = "#fff0f0", high = "#940606") +
  scale_y_reverse(breaks = 23:0, labels = 23:0, expand = c(0, 0)) +
  scale_x_discrete(expand = c(0, 0), position = "top") +
  labs(
    title = "Number of Started Events/Questions by Day of Week / Hour of Day",
    y = "Hour of Day"
  ) +
  geom_text(aes(label = count_wday_hour), size = 2) +
  theme_heatmap
gg2

Distribution of Time Spent per Event/Question with largest 5 % removed
# Keep only events strictly below the 95th percentile of time_spent
# (rows with a missing time_spent are dropped by subset() as well).
time_spent_cutoff <- quantile(df$time_spent, 0.95, na.rm = TRUE)
df_clean <- subset(df, time_spent < time_spent_cutoff)

# Histogram of the remaining durations, converted to minutes.
minutes_spent <- df_clean$time_spent[!is.na(df_clean$time_spent)] / 60
hist(
  minutes_spent,
  breaks = 20,
  xlab = "Time Spent in Minutes",
  main = "Histogram of the Time Spent by Question"
)

Aggregated by Event/Question
Count of Old-New Value Pairs
# How often each (question, old answer, new answer) combination occurs among
# the detected answer changes; only combinations seen more than once are
# kept, most frequent first.
df_stream <- df %>%
  filter(!is.na(time_till_change)) %>%
  count(
    question_decoded, old_value_decoded, new_value_decoded,
    name = "count_value_pairs"
  ) %>%
  arrange(desc(count_value_pairs)) %>%
  filter(count_value_pairs > 1)
df_stream
Aggregated by Instance
Top 10 % of Duration by Instance
# Instances whose total duration (first start to last end) lies above the
# 90th percentile of all instances, longest first.
#
# The duration is computed in seconds explicitly: plain subtraction of
# date-times picks its own unit, and seconds_to_period() reads the bare
# number as seconds. Rounding happens BEFORE the conversion to a period so
# that the result cannot show e.g. "58M 60S" instead of "59M 0S".
df_duration_per_inst <- df %>%
  group_by(instance.ID) %>%
  summarise(
    duration_per_inst = difftime(
      max(end, na.rm = TRUE),
      min(start, na.rm = TRUE),
      units = "secs"
    )
  ) %>%
  filter(duration_per_inst > quantile(duration_per_inst, 0.9, na.rm = TRUE)) %>%
  # sort while the column is still a plain duration in seconds
  arrange(desc(duration_per_inst)) %>%
  mutate(
    duration_per_inst = seconds_to_period(
      round(as.numeric(duration_per_inst, units = "secs"))
    )
  )
df_duration_per_inst
| uuid:2025d106-f4a6-423e-a8cb-0ad9ee3d4f65 |
6d 17H 58M 60S |
Distribution of Duration by Instance with Top 10 % excluded
# Duration per instance (first start to last end), keeping only instances
# strictly below the 90th percentile.
# The unit is fixed to seconds because plain subtraction of date-times picks
# its own unit; the histogram then converts explicitly to minutes instead of
# dividing a value of unknown unit by 60.
df_subsetted <- df %>%
  group_by(instance.ID) %>%
  summarise(
    duration_per_inst = difftime(
      max(end, na.rm = TRUE),
      min(start, na.rm = TRUE),
      units = "secs"
    )
  ) %>%
  filter(duration_per_inst < quantile(duration_per_inst, 0.9, na.rm = TRUE))
hist(
  as.numeric(df_subsetted$duration_per_inst, units = "mins"),
  breaks = 30,
  main = "Duration per Instance in Minutes (outliers removed)",
  xlab = "Duration in Minutes"
)

Irregularities and Outliers
Time Till Change Outliers (computed on all data, without prior outlier removal)
# Answer changes whose time_till_change lies above the 90th percentile of all
# observed values, longest first, shown as a period for readability.
ttc_cutoff <- quantile(df$time_till_change, 0.9, na.rm = TRUE)
df_time_till_change_outliers <- df %>%
  filter(time_till_change > ttc_cutoff) %>%
  arrange(desc(time_till_change)) %>%
  select(
    instance.ID,
    question_decoded,
    old_value_decoded,
    new_value_decoded,
    time_till_change
  ) %>%
  mutate(time_till_change = round(seconds_to_period(time_till_change)))
df_time_till_change_outliers
Histograms of Instances with Inconsistent Filling Behaviour
# Flag instances with an inconsistent filling behaviour: the time span of an
# instance is cut into 10 equal-width parts, and the instance is flagged when
# its events fall into fewer than 5 of them.
# The result is built in one pass instead of growing a vector inside a loop;
# when nothing is flagged it is an empty vector of IDs.
instance_ids <- unique(df$instance.ID)
is_irregular <- vapply(
  instance_ids,
  function(id) {
    bin_vec <- cut(df$start[df$instance.ID == id], breaks = 10, labels = FALSE)
    length(unique(bin_vec)) < 5
  },
  logical(1),
  USE.NAMES = FALSE
)
irregular_inst <- instance_ids[is_irregular]
paste0(
  length(irregular_inst), " out of ", length(instance_ids),
  " instances were found to have an inconsistent filling behaviour."
)
## [1] "1 out of 2 instances were found to have an inconsistent filling behaviour."
# For every irregular instance: overlay a histogram of its events over the 10
# parts of its time span, and collect the questions that fall into the last
# part.
part_labels <- paste0(seq_len(10), ". Part")
fig <- plot_ly(alpha = 0.1)
# one list slot per instance instead of growing a vector inside the loop
last_bin_by_inst <- vector("list", length(irregular_inst))
for (i in seq_along(irregular_inst)) {
  id <- irregular_inst[[i]]
  temp_df <- df[df$instance.ID == id, ]
  temp_df$cut <- cut(temp_df$start, breaks = 10, labels = part_labels)
  fig <- fig %>% add_histogram(x = temp_df$cut, name = id)
  last_bin_by_inst[[i]] <-
    temp_df$question_decoded[temp_df$cut == "10. Part"]
}
# character(0) rather than NULL when there are no irregular instances
last_bin_questions <- as.character(unlist(last_bin_by_inst))
fig <- fig %>% layout(barmode = "overlay")
fig
# table() fails with "nothing to tabulate" on empty input, so the frequency
# table is only built when at least one question was collected.
if (any(!is.na(last_bin_questions))) {
  kable(table(last_bin_questions) %>% as.data.frame() %>% arrange(desc(Freq)))
} else {
  message("No questions found in the last part of any irregular instance.")
}
| Facility identification |
2 |
| Participant identification |
2 |
| Can you show me all the medicines and prescriptions that you received? |
1 |
| Consultation satisfaction |
1 |
| Cost |
1 |
| Counselling and follow-up advice |
1 |
| Did the provider explain to you how to give these medicines to the child at home? |
1 |
| Did the provider give or prescribe any medicines for the child to take home? |
1 |
| Did the provider refer the child? |
1 |
| Did the provider speak in a language you understand? |
1 |
| Did the provider tell you what illness your child has? |
1 |
| Did you feel the provider treated you and the child with respect? |
1 |
| Did you find the provider showed concern and empathy? |
1 |
| Did you find the provider was kind to you? |
1 |
| Did you miss work to bring the child to the facility today? |
1 |
| Did you pay for something at the facility today? |
1 |
| Do you intend to buy some medicines outside of the facility? |
1 |
| front_page |
1 |
| How confident do you feel in how much of the medication to give each day and how many days to give it? |
1 |
| How do you feel overall with the service you received at the facility today? |
1 |
| Is this facility the closest health facility to your home? |
1 |
| Please scan the participant’s QR code |
1 |
| Please select the current district |
1 |
| Treatment |
1 |
| Was the service delayed or were you kept waiting for a long time? |
1 |
| Were you given general information or advice about feeding or breastfeeding? |
1 |
| Were you informed of signs / symptoms that require you to bring the child back to the facility immediately? |
1 |
| What do you intend to do if the sick child does not get completely better or become worse? |
1 |
| Would you recommend this facility to a friend / family with a sick child? |
1 |